Load library packages

library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
-- Attaching packages ---------------------------------------------------------------------------- tidyverse 1.3.1 --
v ggplot2 3.3.3     v purrr   0.3.4
v tibble  3.1.2     v dplyr   1.0.6
v tidyr   1.1.3     v stringr 1.4.0
v readr   1.4.0     v forcats 0.5.1
-- Conflicts ------------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(skimr)
library(tidymodels)
Registered S3 method overwritten by 'tune':
  method                   from   
  required_pkgs.model_spec parsnip
-- Attaching packages --------------------------------------------------------------------------- tidymodels 0.1.3 --
v broom        0.7.6      v rsample      0.1.0 
v dials        0.0.9      v tune         0.1.5 
v infer        0.5.4      v workflows    0.2.2 
v modeldata    0.1.0      v workflowsets 0.0.2 
v parsnip      0.1.6      v yardstick    0.0.8 
v recipes      0.1.16     
-- Conflicts ------------------------------------------------------------------------------ tidymodels_conflicts() --
x recipes::check()  masks devtools::check()
x scales::discard() masks purrr::discard()
x dplyr::filter()   masks stats::filter()
x recipes::fixed()  masks stringr::fixed()
x dplyr::lag()      masks stats::lag()
x yardstick::spec() masks readr::spec()
x recipes::step()   masks stats::step()
* Use tidymodels_prefer() to resolve common conflicts.
library(corrplot)
corrplot 0.89 loaded

5.14 Case studies

https://jhudatascience.org/tidyversecourse/model.html#case-studies-4

Predicting Annual Air Pollution (case study 1)

the data

There are 48 predictors with values for 876 monitors (observations). The data comes from the US Environmental Protection Agency (EPA), the National Aeronautics and Space Administration (NASA), the US Census, and the National Center for Health Statistics (NCHS).

https://jhudatascience.org/tidyversecourse/model.html#data-import

# Read the monitor data (EPA / NASA / US Census / NCHS sources).
# readr guesses column types: everything numeric except the three
# location-name columns.
pm <- read_csv("data/pm25_data.csv")

-- Column specification ---------------------------------------------------------------------------------------------
cols(
  .default = col_double(),
  state = col_character(),
  county = col_character(),
  city = col_character()
)
i Use `spec()` for the full column specifications.
# Inspect the structure: 876 rows (monitors) and 50 columns.
glimpse(pm)
Rows: 876
Columns: 50
$ id                          <dbl> 1003.001, 1027.000, 1033.100, 1049.100, 1055.001, 1069.000, 1073.002, 1073.101,~
$ value                       <dbl> 9.597647, 10.800000, 11.212174, 11.659091, 12.375394, 10.508850, 15.591017, 12.~
$ fips                        <dbl> 1003, 1027, 1033, 1049, 1055, 1069, 1073, 1073, 1073, 1073, 1073, 1073, 1073, 1~
$ lat                         <dbl> 30.49800, 33.28126, 34.75878, 34.28763, 33.99375, 31.22636, 33.55306, 33.33111,~
$ lon                         <dbl> -87.88141, -85.80218, -87.65056, -85.96830, -85.99107, -85.39077, -86.81500, -8~
$ state                       <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "A~
$ county                      <chr> "Baldwin", "Clay", "Colbert", "DeKalb", "Etowah", "Houston", "Jefferson", "Jeff~
$ city                        <chr> "Fairhope", "Ashland", "Muscle Shoals", "Crossville", "Gadsden", "Dothan", "Bir~
$ CMAQ                        <dbl> 8.098836, 9.766208, 9.402679, 8.534772, 9.241744, 9.121892, 10.235612, 10.23561~
$ zcta                        <dbl> 36532, 36251, 35660, 35962, 35901, 36303, 35207, 35111, 35444, 35094, 35064, 35~
$ zcta_area                   <dbl> 190980522, 374132430, 16716984, 203836235, 154069359, 162685124, 26929603, 1662~
$ zcta_pop                    <dbl> 27829, 5103, 9042, 8300, 20045, 30217, 9010, 16140, 3699, 14212, 11458, 32390, ~
$ imp_a500                    <dbl> 0.01730104, 1.96972318, 19.17301038, 5.78200692, 16.49307958, 19.13927336, 41.8~
$ imp_a1000                   <dbl> 1.40960208, 0.85315744, 11.14489619, 3.86764706, 11.09645329, 16.67149654, 41.0~
$ imp_a5000                   <dbl> 3.33601181, 0.98514787, 15.17861538, 1.23114143, 14.67778782, 16.38548211, 36.0~
$ imp_a10000                  <dbl> 1.9879187, 0.5208189, 9.7253870, 1.0316469, 8.2200785, 8.0739624, 23.1790413, 4~
$ imp_a15000                  <dbl> 1.4386207, 0.3359198, 5.2472094, 0.9730444, 5.1612102, 4.7401296, 17.4524844, 4~
$ county_area                 <dbl> 4117521611, 1564252280, 1534877333, 2012662359, 1385618994, 1501737720, 2878192~
$ county_pop                  <dbl> 182265, 13932, 54428, 71109, 104430, 101547, 658466, 658466, 194656, 658466, 65~
$ log_dist_to_prisec          <dbl> 4.648181, 7.219907, 5.760131, 3.721489, 5.261457, 7.112373, 6.600958, 6.526896,~
$ log_pri_length_5000         <dbl> 8.517193, 8.517193, 8.517193, 8.517193, 9.066563, 8.517193, 11.156977, 10.91506~
$ log_pri_length_10000        <dbl> 9.210340, 9.210340, 9.274303, 10.409411, 11.137360, 9.210340, 11.849658, 11.728~
$ log_pri_length_15000        <dbl> 9.630228, 9.615805, 9.658899, 11.173626, 11.587258, 9.615805, 12.401245, 12.230~
$ log_pri_length_25000        <dbl> 11.32735, 10.12663, 10.15769, 11.90959, 12.01356, 10.12663, 12.98762, 12.86472,~
$ log_prisec_length_500       <dbl> 7.295356, 6.214608, 8.611945, 7.310155, 8.740680, 6.214608, 6.214608, 6.214608,~
$ log_prisec_length_1000      <dbl> 8.195119, 7.600902, 9.735569, 8.585843, 9.627898, 7.600902, 9.075921, 8.949239,~
$ log_prisec_length_5000      <dbl> 10.815042, 10.170878, 11.770407, 10.214200, 11.728889, 12.298627, 12.281645, 11~
$ log_prisec_length_10000     <dbl> 11.886803, 11.405543, 12.840663, 11.508944, 12.768279, 12.994141, 13.278416, 12~
$ log_prisec_length_15000     <dbl> 12.205723, 12.042963, 13.282656, 12.353663, 13.189489, 13.326132, 13.826365, 12~
$ log_prisec_length_25000     <dbl> 13.41395, 12.79980, 13.79973, 13.55979, 13.70026, 13.85550, 14.45221, 13.80017,~
$ log_nei_2008_pm25_sum_10000 <dbl> 0.318035438, 3.218632928, 6.573127301, 0.000000000, 3.772775514, 0.909136688, 7~
$ log_nei_2008_pm25_sum_15000 <dbl> 1.967358961, 3.218632928, 6.581917457, 3.282789151, 3.785646441, 3.174609213, 8~
$ log_nei_2008_pm25_sum_25000 <dbl> 5.067308, 3.218633, 6.875900, 4.887665, 4.160506, 3.190008, 8.651068, 7.806773,~
$ log_nei_2008_pm10_sum_10000 <dbl> 1.35588511, 3.31111648, 6.69187313, 0.00000000, 4.43719884, 0.92888890, 8.20975~
$ log_nei_2008_pm10_sum_15000 <dbl> 2.26783411, 3.31111648, 6.70127741, 3.35004443, 4.46267932, 3.67473904, 8.64883~
$ log_nei_2008_pm10_sum_25000 <dbl> 5.628728, 3.311116, 7.148858, 5.171920, 4.678311, 3.744629, 8.858019, 8.021072,~
$ popdens_county              <dbl> 44.265706, 8.906492, 35.460814, 35.330814, 75.367038, 67.619664, 228.777633, 22~
$ popdens_zcta                <dbl> 145.7164307, 13.6395554, 540.8870404, 40.7189625, 130.1037411, 185.7391706, 334~
$ nohs                        <dbl> 3.3, 11.6, 7.3, 14.3, 4.3, 5.8, 7.1, 2.7, 11.1, 7.2, 2.7, 0.8, 2.8, 9.7, 1.2, 3~
$ somehs                      <dbl> 4.9, 19.1, 15.8, 16.7, 13.3, 11.6, 17.1, 6.6, 11.6, 12.2, 9.8, 2.6, 8.2, 21.6, ~
$ hs                          <dbl> 25.1, 33.9, 30.6, 35.0, 27.8, 29.8, 37.2, 30.7, 46.0, 32.2, 28.8, 12.9, 32.0, 3~
$ somecollege                 <dbl> 19.7, 18.8, 20.9, 14.9, 29.2, 21.4, 23.5, 25.7, 17.2, 19.0, 26.3, 17.9, 28.9, 2~
$ associate                   <dbl> 8.2, 8.0, 7.6, 5.5, 10.1, 7.9, 7.3, 8.0, 4.1, 6.8, 12.4, 5.2, 7.4, 5.2, 6.5, 6.~
$ bachelor                    <dbl> 25.3, 5.5, 12.7, 7.9, 10.0, 13.7, 5.9, 17.6, 7.1, 14.8, 11.6, 35.5, 13.5, 2.2, ~
$ grad                        <dbl> 13.5, 3.1, 5.1, 5.8, 5.4, 9.8, 2.0, 8.7, 2.9, 7.7, 8.3, 25.2, 7.1, 0.4, 23.3, 4~
$ pov                         <dbl> 6.1, 19.5, 19.0, 13.8, 8.8, 15.6, 25.5, 7.3, 8.1, 10.5, 18.1, 2.1, 5.1, 13.3, 5~
$ hs_orless                   <dbl> 33.3, 64.6, 53.7, 66.0, 45.4, 47.2, 61.4, 40.0, 68.7, 51.6, 41.3, 16.3, 43.0, 7~
$ urc2013                     <dbl> 4, 6, 4, 6, 4, 4, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 4, 3, 1, 5, 4, 2, 6, 4, 4~
$ urc2006                     <dbl> 5, 6, 4, 5, 4, 4, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 4, 3, 1, 5, 4, 2, 6, 5, 4~
$ aod                         <dbl> 37.363636, 34.818182, 36.000000, 33.083333, 43.416667, 33.000000, 39.583333, 38~
# The id, fips and zcta columns are identifiers, not quantities:
# convert them from numeric to factor before summarizing.
pm <- pm %>%
  mutate(id = as_factor(id),
         fips = as_factor(fips),
         zcta = as_factor(zcta))

# Column-type-grouped summary of the whole dataset.
skim(pm)
-- Data Summary ------------------------
                           Values
Name                       pm    
Number of rows             876   
Number of columns          50    
_______________________          
Column type frequency:           
  character                3     
  factor                   3     
  numeric                  44    
________________________         
Group variables            None  

-- Variable type: character -----------------------------------------------------------------------------------------
# A tibble: 3 x 8
  skim_variable n_missing complete_rate   min   max empty n_unique whitespace
* <chr>             <int>         <dbl> <int> <int> <int>    <int>      <int>
1 state                 0             1     4    20     0       49          0
2 county                0             1     3    20     0      471          0
3 city                  0             1     4    48     0      607          0

-- Variable type: factor --------------------------------------------------------------------------------------------
# A tibble: 3 x 6
  skim_variable n_missing complete_rate ordered n_unique top_counts                      
* <chr>             <int>         <dbl> <lgl>      <int> <chr>                           
1 id                    0             1 FALSE        876 100: 1, 102: 1, 103: 1, 104: 1  
2 fips                  0             1 FALSE        569 170: 12, 603: 10, 261: 9, 107: 8
3 zcta                  0             1 FALSE        842 475: 3, 110: 2, 160: 2, 290: 2  

-- Variable type: numeric -------------------------------------------------------------------------------------------
# A tibble: 44 x 11
   skim_variable               n_missing complete_rate          mean      sd            p0           p25
 * <chr>                           <int>         <dbl>         <dbl>   <dbl>         <dbl>         <dbl>
 1 value                               0             1         10.8  2.58e+0        3.02            9.27
 2 lat                                 0             1         38.5  4.62e+0       25.5            35.0 
 3 lon                                 0             1        -91.7  1.50e+1     -124.            -99.2 
 4 CMAQ                                0             1          8.41 2.97e+0        1.63            6.53
 5 zcta_area                           0             1  183173482.   5.43e+8    15459        14204602.  
 6 zcta_pop                            0             1      24228.   1.78e+4        0            9797   
 7 imp_a500                            0             1         24.7  1.93e+1        0               3.70
 8 imp_a1000                           0             1         24.3  1.80e+1        0               5.32
 9 imp_a5000                           0             1         19.9  1.47e+1        0.0534          6.79
10 imp_a10000                          0             1         15.8  1.38e+1        0.0942          4.54
11 imp_a15000                          0             1         13.4  1.31e+1        0.108           3.24
12 county_area                         0             1 3768701992.   6.21e+9 33703512      1116536298.  
13 county_pop                          0             1     687298.   1.29e+6      783          100948   
14 log_dist_to_prisec                  0             1          6.19 1.41e+0       -1.46            5.43
15 log_pri_length_5000                 0             1          9.82 1.08e+0        8.52            8.52
16 log_pri_length_10000                0             1         10.9  1.13e+0        9.21            9.80
17 log_pri_length_15000                0             1         11.5  1.15e+0        9.62           10.9 
18 log_pri_length_25000                0             1         12.2  1.10e+0       10.1            11.7 
19 log_prisec_length_500               0             1          6.99 9.53e-1        6.21            6.21
20 log_prisec_length_1000              0             1          8.56 7.90e-1        7.60            7.60
21 log_prisec_length_5000              0             1         11.3  7.81e-1        8.52           10.9 
22 log_prisec_length_10000             0             1         12.4  7.33e-1        9.21           12.0 
23 log_prisec_length_15000             0             1         13.0  7.23e-1        9.62           12.6 
24 log_prisec_length_25000             0             1         13.8  6.99e-1       10.1            13.4 
25 log_nei_2008_pm25_sum_10000         0             1          3.97 2.35e+0        0               2.15
26 log_nei_2008_pm25_sum_15000         0             1          4.72 2.25e+0        0               3.47
27 log_nei_2008_pm25_sum_25000         0             1          5.67 2.11e+0        0               4.66
28 log_nei_2008_pm10_sum_10000         0             1          4.35 2.32e+0        0               2.69
29 log_nei_2008_pm10_sum_15000         0             1          5.10 2.18e+0        0               3.87
30 log_nei_2008_pm10_sum_25000         0             1          6.07 2.01e+0        0               5.10
31 popdens_county                      0             1        552.   1.71e+3        0.263          40.8 
32 popdens_zcta                        0             1       1280.   2.76e+3        0             101.  
33 nohs                                0             1          6.99 7.21e+0        0               2.7 
34 somehs                              0             1         10.2  6.20e+0        0               5.9 
35 hs                                  0             1         30.3  1.14e+1        0              23.8 
36 somecollege                         0             1         21.6  8.60e+0        0              17.5 
37 associate                           0             1          7.13 4.01e+0        0               4.9 
38 bachelor                            0             1         14.9  9.71e+0        0               8.8 
39 grad                                0             1          8.91 8.65e+0        0               3.9 
40 pov                                 0             1         15.0  1.13e+1        0               6.5 
41 hs_orless                           0             1         47.5  1.68e+1        0              37.9 
42 urc2013                             0             1          2.92 1.52e+0        1               2   
43 urc2006                             0             1          2.97 1.52e+0        1               2   
44 aod                                 0             1         43.7  1.96e+1        5              31.7 
             p50           p75     p100 hist 
 *         <dbl>         <dbl>    <dbl> <chr>
 1         11.2          12.4   2.32e 1 ▂▆▇▁▁
 2         39.3          41.7   4.84e 1 ▁▃▅▇▂
 3        -87.5         -80.7  -6.80e 1 ▃▂▃▇▃
 4          8.62         10.2   2.31e 1 ▃▇▃▁▁
 5   37653560.    160041508.    8.16e 9 ▇▁▁▁▁
 6      22014         35005.    9.54e 4 ▇▇▃▁▁
 7         25.1          40.2   6.96e 1 ▇▅▆▃▂
 8         24.5          38.6   6.75e 1 ▇▅▆▃▁
 9         19.1          30.1   7.46e 1 ▇▆▃▁▁
10         12.4          24.2   7.21e 1 ▇▃▂▁▁
11          9.67         20.6   7.11e 1 ▇▃▁▁▁
12 1690826566.   2878192209     5.19e10 ▇▁▁▁▁
13     280730.       743159     9.82e 6 ▇▁▁▁▁
14          6.36          7.15  1.05e 1 ▁▁▃▇▁
15         10.1          10.7   1.20e 1 ▇▂▆▅▂
16         11.2          11.8   1.30e 1 ▇▂▇▇▃
17         11.7          12.4   1.36e 1 ▆▂▇▇▃
18         12.5          13.1   1.44e 1 ▅▃▇▇▃
19          6.21          7.82  9.40e 0 ▇▁▂▂▁
20          8.66          9.20  1.05e 1 ▇▅▆▃▁
21         11.4          11.8   1.28e 1 ▁▁▃▇▃
22         12.5          12.9   1.38e 1 ▁▁▃▇▅
23         13.1          13.6   1.44e 1 ▁▁▃▇▅
24         13.9          14.4   1.52e 1 ▁▁▃▇▆
25          4.29          5.69  9.12e 0 ▆▅▇▆▂
26          5.00          6.35  9.42e 0 ▃▃▇▇▂
27          5.91          7.28  9.65e 0 ▂▂▇▇▃
28          4.62          6.07  9.34e 0 ▅▅▇▇▂
29          5.39          6.72  9.71e 0 ▂▃▇▇▂
30          6.37          7.52  9.88e 0 ▁▂▆▇▃
31        157.          511.    2.68e 4 ▇▁▁▁▁
32        610.         1383.    3.04e 4 ▇▁▁▁▁
33          5.1           8.8   1   e 2 ▇▁▁▁▁
34          9.4          13.9   7.22e 1 ▇▂▁▁▁
35         30.8          36.1   1   e 2 ▂▇▂▁▁
36         21.3          24.7   1   e 2 ▆▇▁▁▁
37          7.1           8.8   7.14e 1 ▇▁▁▁▁
38         13.0          19.2   1   e 2 ▇▂▁▁▁
39          6.7          11     1   e 2 ▇▁▁▁▁
40         12.1          21.2   6.59e 1 ▇▅▂▁▁
41         48.7          59.1   1   e 2 ▁▃▇▃▁
42          3             4     6   e 0 ▇▅▃▂▁
43          3             4     6   e 0 ▇▅▃▂▁
44         40.2          49.7   1.43e 2 ▃▇▁▁▁
# Pairwise correlations among the numeric columns only (factors and
# characters are excluded).  select_if() is superseded in modern
# dplyr; select(where(...)) is the current idiom.
PM_cor <- cor(pm %>% dplyr::select(where(is.numeric)))
# tl.cex shrinks the text labels so all 44 variable names stay legible.
corrplot::corrplot(PM_cor, tl.cex = 0.5)

split the data

tidymodels

# Fix the RNG state so the split is reproducible, then allocate
# two thirds of the monitors to training and one third to testing.
set.seed(1234)
pm_split <- rsample::initial_split(pm, prop = 2/3)
pm_split
<Analysis/Assess/Total>
<584/292/876>

Importantly, the initial_split() function only determines which rows of our pm data frame should be assigned for training or testing; it does not actually split the data.

To extract the training and testing data, we can use the training() and testing() functions, also from the rsample package.

# Materialize the two data sets defined by the split object.
train_pm <- rsample::training(pm_split)
test_pm  <- rsample::testing(pm_split)

recipe

https://recipes.tidymodels.org/reference/index.html

# library(tidymodels)
# Minimal recipe: model `value` (annual PM2.5) on every other column.
simple_rec <- recipes::recipe(value ~ ., data = train_pm)

simple_rec
Data Recipe

Inputs:

update role

of id which is not a predictor

# Rebuild the recipe, reclassifying `id` as an identifier so it is
# carried through preprocessing without being used as a predictor.
simple_rec <- recipes::recipe(value ~ ., data = train_pm) %>%
  recipes::update_role(id, new_role = "id variable")

simple_rec
Data Recipe

Inputs:

steps

https://recipes.tidymodels.org/reference/index.html

All of the step functions look like step_*(), with the * replaced by a descriptive name, except for the check functions, which look like check_*().

There are several ways to select what variables to apply steps to:

  1. Using tidyselect methods: contains(), matches(), starts_with(), ends_with(), everything(), num_range()
  2. Using the type: all_nominal(), all_numeric() , has_type()
  3. Using the role: all_predictors(), all_outcomes(), has_role()
  4. Using the name - use the actual name of the variable/variables of interest

We want to dummy encode our categorical variables so that they are numeric as we plan to use a linear regression for our model.

We will use one-hot encoding. This means that we do not simply encode our categorical variables numerically, since such numeric assignments can be interpreted by algorithms as implying a particular rank or order. Instead, binary variables made of 1s and 0s are used, assigning each category a numeric representation that has no apparent order.

# One-hot encode the categorical variables: one 0/1 indicator column
# per level, with no reference level dropped.
simple_rec %>%
  step_dummy(state, county, city, zcta, one_hot = TRUE)
Data Recipe

Inputs:

Operations:

Dummy variables from state, county, city, zcta
# Tag `fips` as an identifier so it is retained in the data but not
# treated as a predictor.
simple_rec %>%
  update_role("fips", new_role = "county id")
Data Recipe

Inputs:
step_corr - remove redundant correlations

We also want to remove variables that appear to be redundant and are highly correlated with others, as we know from our exploratory data analysis that many of our variables are correlated with one another. We can do this using the step_corr() function.

# Drop predictors that are highly correlated with others, but always
# keep CMAQ and aod (excluded from the filter with the minus sign).
simple_rec %>%
  step_corr(all_predictors(), - CMAQ, - aod)
Data Recipe

Inputs:

Operations:

Correlation filter on all_predictors(), -CMAQ, -aod
step_nzv – remove variables with near-zero variance
# Remove sparse / near-zero-variance predictors, again protecting
# CMAQ and aod from removal.
simple_rec %>%
  step_nzv(all_predictors(), - CMAQ, - aod)
Data Recipe

Inputs:

Operations:

Sparse, unbalanced variable filter on all_predictors(), -CMAQ, -aod
Putting it all together
# Full preprocessing recipe for the air-pollution model:
#   * `id` and `fips` are bookkeeping columns, not predictors
#   * categorical variables are one-hot encoded
#   * highly correlated and near-zero-variance predictors are removed,
#     always keeping CMAQ and aod
# All recipe calls are qualified with recipes:: for consistency (the
# original mixed qualified and bare calls), and pipe spacing is fixed.
simple_rec <- train_pm %>%
  recipes::recipe(value ~ .) %>%
  recipes::update_role(id, new_role = "id variable") %>%
  recipes::update_role("fips", new_role = "county id") %>%
  recipes::step_dummy(state, county, city, zcta, one_hot = TRUE) %>%
  recipes::step_corr(all_predictors(), -CMAQ, -aod) %>%
  recipes::step_nzv(all_predictors(), -CMAQ, -aod)

simple_rec
Data Recipe

Inputs:

Operations:

Dummy variables from state, county, city, zcta
Correlation filter on all_predictors(), -CMAQ, -aod
Sparse, unbalanced variable filter on all_predictors(), -CMAQ, -aod

Preprocessing

prepped_rec <- prep(simple_rec, verbose = TRUE, retain = TRUE)
oper 1 step dummy [training] 
oper 2 step corr [training] 
Warning in cor(x, use = use, method = method) :
  the standard deviation is zero
Warning: The correlation matrix has missing values. 273 columns were excluded from the filter.
oper 3 step nzv [training] 
The retained training set is ~ 0.26 Mb  in memory.
bake

Since we retained our preprocessed training data (i.e., prep(retain = TRUE)), we can take a look at it by using the bake() function of the recipes package.

bake() requires the new_data = NULL argument when we are using the retained training data.

# new_data = NULL returns the retained, already-preprocessed training
# set rather than applying the recipe to fresh data.
preproc_train <- bake(prepped_rec, new_data = NULL)
glimpse(preproc_train)
Rows: 584
Columns: 37
$ id                          <fct> 18003.0004, 55041.0007, 6065.1003, 39009.0003, 39061.8001, 24510.0006, 6061.000~
$ fips                        <fct> 18003, 55041, 6065, 39009, 39061, 24510, 6061, 6065, 44003, 37111, 19113, 6031,~
$ lat                         <dbl> 41.09497, 45.56300, 33.94603, 39.44217, 39.18053, 39.34056, 38.74573, 33.85275,~
$ lon                         <dbl> -85.10182, -88.80880, -117.40063, -81.90883, -84.49215, -76.58222, -121.26631, ~
$ CMAQ                        <dbl> 10.383231, 3.411247, 11.404085, 7.971165, 11.791236, 10.940985, 8.814368, 7.719~
$ zcta_area                   <dbl> 16696709, 370280916, 41957182, 132383592, 5358625, 461424, 28131203, 94913381, ~
$ zcta_pop                    <dbl> 21306, 4141, 44001, 1115, 6566, 934, 41192, 26179, 6135, 30600, 40149, 26079, 3~
$ imp_a500                    <dbl> 28.97837370, 0.00000000, 30.39013841, 0.00000000, 51.26198934, 23.86332180, 31.~
$ imp_a15000                  <dbl> 13.0547959, 0.3676404, 23.7457506, 0.3307975, 21.7174564, 24.0009267, 17.909259~
$ county_area                 <dbl> 1702419942, 2626421270, 18664696661, 1304313482, 1051302780, 209643241, 3644136~
$ county_pop                  <dbl> 355329, 9304, 2189641, 64757, 802374, 620961, 348432, 2189641, 166158, 44996, 2~
$ log_dist_to_prisec          <dbl> 6.621891, 8.415468, 7.419762, 6.344681, 5.778350, 6.290041, 7.018836, 6.838720,~
$ log_pri_length_5000         <dbl> 8.517193, 8.517193, 10.150514, 8.517193, 9.956985, 10.300814, 10.288845, 9.4423~
$ log_pri_length_25000        <dbl> 12.77378, 10.16440, 13.14450, 10.12663, 13.02123, 13.68658, 12.19710, 11.33494,~
$ log_prisec_length_500       <dbl> 6.214608, 6.214608, 6.214608, 6.214608, 7.141268, 6.214608, 6.214608, 6.214608,~
$ log_prisec_length_1000      <dbl> 9.240294, 7.600902, 7.600902, 8.793450, 8.407702, 9.451706, 7.600902, 8.094200,~
$ log_prisec_length_5000      <dbl> 11.485093, 9.425537, 10.155961, 10.562382, 11.698608, 12.086627, 10.288845, 10.~
$ log_prisec_length_10000     <dbl> 12.75582, 11.44833, 11.59563, 11.69093, 12.95940, 13.60081, 11.04273, 11.16916,~
$ log_nei_2008_pm10_sum_10000 <dbl> 4.91110140, 3.86982666, 4.03184660, 0.00000000, 5.89930186, 5.67208039, 4.20670~
$ log_nei_2008_pm10_sum_15000 <dbl> 5.399131255, 3.883688842, 5.459257126, 0.000000000, 6.294167687, 6.365952312, 4~
$ log_nei_2008_pm10_sum_25000 <dbl> 5.8160473, 3.8872637, 6.8845369, 3.7656347, 6.7347561, 8.6395685, 5.7761975, 3.~
$ popdens_county              <dbl> 208.719947, 3.542463, 117.314577, 49.648341, 763.218756, 2961.989125, 95.614433~
$ popdens_zcta                <dbl> 1276.059851, 11.183401, 1048.711994, 8.422494, 1225.314330, 2024.168660, 1464.2~
$ nohs                        <dbl> 4.3, 5.1, 3.7, 4.8, 2.1, 0.0, 2.5, 7.7, 0.8, 8.6, 1.4, 19.2, 8.9, 6.4, 8.3, 2.2~
$ somehs                      <dbl> 6.7, 10.4, 5.9, 11.5, 10.5, 0.0, 4.3, 7.5, 4.0, 13.9, 4.8, 25.4, 26.4, 9.5, 11.~
$ hs                          <dbl> 31.7, 40.3, 17.9, 47.3, 30.0, 0.0, 17.8, 21.7, 33.4, 33.4, 25.3, 26.5, 31.6, 26~
$ somecollege                 <dbl> 27.2, 24.1, 26.3, 20.0, 27.1, 0.0, 26.1, 26.0, 19.2, 18.1, 22.4, 20.0, 28.4, 26~
$ associate                   <dbl> 8.2, 7.4, 8.3, 3.1, 8.5, 71.4, 13.2, 7.6, 11.8, 11.0, 10.3, 5.4, 3.6, 4.3, 9.7,~
$ bachelor                    <dbl> 15.0, 8.6, 20.2, 9.8, 14.2, 0.0, 23.4, 17.2, 20.9, 10.2, 25.2, 2.3, 1.1, 19.1, ~
$ grad                        <dbl> 6.8, 4.2, 17.7, 3.5, 7.6, 28.6, 12.6, 12.3, 9.9, 4.8, 10.5, 1.1, 0.0, 8.4, 5.9,~
$ pov                         <dbl> 13.500, 18.900, 6.700, 14.400, 12.500, 3.525, 5.900, 10.300, 5.300, 14.000, 5.8~
$ hs_orless                   <dbl> 42.7, 55.8, 27.5, 63.6, 42.6, 0.0, 24.6, 36.9, 38.2, 55.9, 31.5, 71.1, 66.9, 42~
$ urc2006                     <dbl> 3, 6, 1, 5, 1, 1, 2, 1, 2, 6, 4, 4, 4, 4, 4, 2, 5, 3, 5, 4, 3, 5, 5, 1, 1, 1, 3~
$ aod                         <dbl> 54.11111, 31.16667, 83.12500, 33.36364, 50.80000, 53.50000, 65.09091, 18.14286,~
$ value                       <dbl> 11.699065, 6.956780, 13.289744, 10.742000, 14.454783, 12.204167, 11.192063, 6.9~
$ state_California            <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
$ city_Not.in.a.city          <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
extracting
baked_test_pm <- recipes::bake(prepped_rec, new_data = test_pm)
Warning: There are new levels in a factor: Colbert, Etowah, Russell, Walker, Cochise, Coconino, Mohave, Arkansas, Crittenden, Garland, Phillips, Pope, Calaveras, Humboldt, Inyo, Merced, Monterey, San Benito, Sonoma, Tulare, Arapahoe, Elbert, Larimer, Mesa, Brevard, Leon, Sarasota, Paulding, Ada, Benewah, Lemhi, Shoshone, DuPage, McHenry, Winnebago, Elkhart, St. Joseph, Tippecanoe, Vanderburgh, Black Hawk, Pottawattamie, Van Buren, Wright, Boyd, Christian, Daviess, Hardin, Kenton, Pike, Calcasieu, St. Bernard, Tangipahoa, Baltimore, Cecil, Bristol, Berrien, Missaukee, Muskegon, Oakland, Dakota, Olmsted, Grenada, Lauderdale, Cascade, Sanders, Silver Bow, Scotts Bluff, Merrimack, Rockingham, Atlantic, Gloucester, Lea, San Juan, Santa Fe, Onondaga, Steuben, Westchester, Martin, Pitt, Swain, Watauga, Billings, Lorain, Muskogee, Sequoyah, Deschutes, Multnomah, Umatilla, Berks, Bucks, Cambria, Centre, Lackawanna, Northampton, York, Edgefield, Florence, Pennington, Bowie, Ector, Cache, Frederick, Brist [... truncated]
Warning: There are new levels in a factor: Muscle Shoals, Gadsden, Dothan, Chickasaw, Montgomery, Phenix City, Douglas, Flagstaff, Peach Springs, Apache Junction, Stuttgart, Hot Springs (Hot Springs National Park), Newport, Russellville, Springdale, San Andreas, Eureka, El Centro, Keeler, Lakeport, Azusa, Burbank, Merced, Salinas, Grass Valley, Anaheim, Quincy, Hollister, Ontario, Victorville, San Bernardino, Chula Vista, San Diego, San Jose, Live Oak, Santa Rosa, Visalia, Piru, Simi Valley, Littleton, Fort Collins, Grand Junction, Bridgeport, Dover, Wilmington, Melbourne, Tampa, Tallahassee, Palm Springs North, Orlando, Belle Glade, Ridge Wood Heights, Savannah, Kennesaw, Doraville, Boise (corporate name Boise City), Pinehurst (Pine Creek), Champaign, Summit, Naperville, Elgin, Zion, Cary, Alton, Braidwood, Rockford, Elkhart, Anderson, South Bend, Evansville, Waterloo, Keokuk, Council Bluffs, Clarion, Frankfort, Elizabethtown, Covington, Richmond, Pikeville, Vinton, Lake Charles, Chalmette, Co [... truncated]
glimpse(baked_test_pm)
Rows: 292
Columns: 37
$ id                          <fct> 1033.1002, 1055.001, 1069.0003, 1073.0023, 1073.1005, 1073.1009, 1073.5003, 109~
$ fips                        <fct> 1033, 1055, 1069, 1073, 1073, 1073, 1073, 1097, 1101, 1113, 1127, 4003, 4005, 4~
$ lat                         <dbl> 34.75878, 33.99375, 31.22636, 33.55306, 33.33111, 33.45972, 33.80167, 30.76994,~
$ lon                         <dbl> -87.65056, -85.99107, -85.39077, -86.81500, -87.00361, -87.30556, -86.94250, -8~
$ CMAQ                        <dbl> 9.402679, 9.241744, 9.121892, 10.235612, 10.235612, 8.161789, 8.816101, 8.65489~
$ zcta_area                   <dbl> 16716984, 154069359, 162685124, 26929603, 166239542, 385566685, 230081840, 1230~
$ zcta_pop                    <dbl> 9042, 20045, 30217, 9010, 16140, 3699, 13794, 6106, 12997, 8638, 10424, 1021, 6~
$ imp_a500                    <dbl> 19.17301038, 16.49307958, 19.13927336, 41.83304498, 1.69765343, 0.00000000, 0.4~
$ imp_a15000                  <dbl> 5.2472094, 5.1612102, 4.7401296, 17.4524844, 4.3006226, 0.1623359, 0.9481135, 7~
$ county_area                 <dbl> 1534877333, 1385618994, 1501737720, 2878192209, 2878192209, 3423328940, 2878192~
$ county_pop                  <dbl> 54428, 104430, 101547, 658466, 658466, 194656, 658466, 412992, 229363, 189885, ~
$ log_dist_to_prisec          <dbl> 5.760131, 5.261457, 7.112373, 6.600958, 6.526896, 9.789557, 8.734821, 5.779170,~
$ log_pri_length_5000         <dbl> 8.517193, 9.066563, 8.517193, 11.156977, 10.915066, 8.517193, 8.517193, 10.3642~
$ log_pri_length_25000        <dbl> 10.15769, 12.01356, 10.12663, 12.98762, 12.86472, 10.12663, 11.85793, 12.23758,~
$ log_prisec_length_500       <dbl> 8.611945, 8.740680, 6.214608, 6.214608, 6.214608, 6.214608, 6.214608, 7.595229,~
$ log_prisec_length_1000      <dbl> 9.735569, 9.627898, 7.600902, 9.075921, 8.949239, 7.600902, 7.600902, 8.658654,~
$ log_prisec_length_5000      <dbl> 11.770407, 11.728889, 12.298627, 12.281645, 11.039001, 8.517193, 8.517193, 11.7~
$ log_prisec_length_10000     <dbl> 12.840663, 12.768279, 12.994141, 13.278416, 12.128956, 9.210340, 10.420770, 12.~
$ log_nei_2008_pm10_sum_10000 <dbl> 6.69187313, 4.43719884, 0.92888890, 8.20975958, 5.77118113, 0.01654963, 0.00000~
$ log_nei_2008_pm10_sum_15000 <dbl> 6.70127741, 4.46267932, 3.67473904, 8.64883455, 6.86889676, 0.01654963, 4.01669~
$ log_nei_2008_pm10_sum_25000 <dbl> 7.148858, 4.678311, 3.744629, 8.858019, 8.021072, 6.891992, 6.961580, 6.640326,~
$ popdens_county              <dbl> 35.4608142, 75.3670384, 67.6196640, 228.7776327, 228.7776327, 56.8616114, 228.7~
$ popdens_zcta                <dbl> 540.8870404, 130.1037411, 185.7391706, 334.5760426, 97.0888142, 9.5936712, 59.9~
$ nohs                        <dbl> 7.3, 4.3, 5.8, 7.1, 2.7, 11.1, 9.7, 3.0, 8.8, 8.6, 5.7, 32.7, 1.7, 23.9, 8.6, 3~
$ somehs                      <dbl> 15.8, 13.3, 11.6, 17.1, 6.6, 11.6, 21.6, 17.1, 19.3, 24.6, 14.3, 11.8, 0.0, 17.~
$ hs                          <dbl> 30.6, 27.8, 29.8, 37.2, 30.7, 46.0, 39.3, 39.0, 36.0, 27.7, 37.6, 45.0, 22.3, 2~
$ somecollege                 <dbl> 20.9, 29.2, 21.4, 23.5, 25.7, 17.2, 21.6, 22.4, 20.2, 19.4, 21.3, 7.9, 36.4, 20~
$ associate                   <dbl> 7.6, 10.1, 7.9, 7.3, 8.0, 4.1, 5.2, 6.6, 4.6, 4.3, 7.3, 2.6, 14.7, 4.3, 4.9, 10~
$ bachelor                    <dbl> 12.7, 10.0, 13.7, 5.9, 17.6, 7.1, 2.2, 7.8, 7.6, 9.5, 10.3, 0.0, 13.6, 4.2, 2.7~
$ grad                        <dbl> 5.1, 5.4, 9.8, 2.0, 8.7, 2.9, 0.4, 4.2, 3.5, 5.9, 3.5, 0.0, 11.4, 2.0, 0.6, 5.4~
$ pov                         <dbl> 19.0, 8.8, 15.6, 25.5, 7.3, 8.1, 13.3, 23.2, 23.0, 37.7, 19.3, 15.6, 64.3, 29.5~
$ hs_orless                   <dbl> 53.7, 45.4, 47.2, 61.4, 40.0, 68.7, 70.6, 59.1, 64.1, 60.9, 57.6, 89.5, 24.0, 6~
$ urc2006                     <dbl> 4, 4, 4, 1, 1, 1, 2, 3, 3, 3, 2, 5, 4, 1, 5, 2, 6, 2, 4, 6, 5, 5, 3, 3, 6, 3, 5~
$ aod                         <dbl> 36.000000, 43.416667, 33.000000, 39.583333, 38.750000, 40.363636, 42.636364, 42~
$ value                       <dbl> 11.212174, 12.375394, 10.508850, 15.591017, 12.399355, 11.103704, 11.775000, 10~
$ state_California            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1~
$ city_Not.in.a.city          <dbl> NA, NA, NA, 0, 1, 1, 1, NA, NA, NA, 0, NA, NA, 0, NA, NA, NA, 0, NA, NA, 0, NA,~
examine
# Compare the city values present in each split to see how little the
# training and testing sets overlap.
traincities <- train_pm %>% distinct(city)
testcities <- test_pm %>% distinct(city)

# Number of cities appearing in the training set but not the test set.
dim(dplyr::setdiff(traincities, testcities))
[1] 381   1
# Number of cities present in both splits.
dim(dplyr::intersect(traincities, testcities))
[1] 55  1

So, let’s go back to our pm dataset and modify the city variable to just be values of in a city or not in a city using the case_when() function of dplyr. This function allows you to vectorize multiple if_else() statements.

pm %>% #count(city, sort = TRUE)
  mutate(city = case_when(city == "Not in a city" ~ "Not in a city",
                          city != "Not in a city" ~ "In a city")) %>% 
  select(state, county, city)
redo
pm <- pm %>% 
  mutate(city = case_when(city == "Not in a city" ~ "Not in a city",
                          city != "Not in a city" ~ "In a city"))

set.seed(1234) # same seed as before
pm_split <-rsample::initial_split(data = pm, prop = 2/3)
pm_split
<Analysis/Assess/Total>
<584/292/876>
 train_pm <-rsample::training(pm_split)
 test_pm <-rsample::testing(pm_split)
novel_rec <-train_pm %>%
    recipe() %>%
    update_role(everything(), new_role = "predictor") %>%
    update_role(value, new_role = "outcome") %>%
    update_role(id, new_role = "id variable") %>%
    update_role("fips", new_role = "county id") %>%
    step_dummy(state, county, city, zcta, one_hot = TRUE) %>%
    step_corr(all_numeric()) %>%
    step_nzv(all_numeric()) 

Now we will check the preprocessed data again to see if we still have NA values

prepped_rec <- prep(novel_rec, verbose = TRUE, retain = TRUE)
oper 1 step dummy [training] 
oper 2 step corr [training] 
Warning in cor(x, use = use, method = method) :
  the standard deviation is zero
Warning: The correlation matrix has missing values. 273 columns were excluded from the filter.
oper 3 step nzv [training] 
The retained training set is ~ 0.27 Mb  in memory.
preproc_train <- bake(prepped_rec, new_data = NULL)
glimpse(preproc_train)
Rows: 584
Columns: 38
$ id                          <fct> 18003.0004, 55041.0007, 6065.1003, 39009.0003, 39061.8001, 24510.0006, 6061.000~
$ value                       <dbl> 11.699065, 6.956780, 13.289744, 10.742000, 14.454783, 12.204167, 11.192063, 6.9~
$ fips                        <fct> 18003, 55041, 6065, 39009, 39061, 24510, 6061, 6065, 44003, 37111, 19113, 6031,~
$ lat                         <dbl> 41.09497, 45.56300, 33.94603, 39.44217, 39.18053, 39.34056, 38.74573, 33.85275,~
$ lon                         <dbl> -85.10182, -88.80880, -117.40063, -81.90883, -84.49215, -76.58222, -121.26631, ~
$ CMAQ                        <dbl> 10.383231, 3.411247, 11.404085, 7.971165, 11.791236, 10.940985, 8.814368, 7.719~
$ zcta_area                   <dbl> 16696709, 370280916, 41957182, 132383592, 5358625, 461424, 28131203, 94913381, ~
$ zcta_pop                    <dbl> 21306, 4141, 44001, 1115, 6566, 934, 41192, 26179, 6135, 30600, 40149, 26079, 3~
$ imp_a500                    <dbl> 28.97837370, 0.00000000, 30.39013841, 0.00000000, 51.26198934, 23.86332180, 31.~
$ imp_a15000                  <dbl> 13.0547959, 0.3676404, 23.7457506, 0.3307975, 21.7174564, 24.0009267, 17.909259~
$ county_area                 <dbl> 1702419942, 2626421270, 18664696661, 1304313482, 1051302780, 209643241, 3644136~
$ county_pop                  <dbl> 355329, 9304, 2189641, 64757, 802374, 620961, 348432, 2189641, 166158, 44996, 2~
$ log_dist_to_prisec          <dbl> 6.621891, 8.415468, 7.419762, 6.344681, 5.778350, 6.290041, 7.018836, 6.838720,~
$ log_pri_length_5000         <dbl> 8.517193, 8.517193, 10.150514, 8.517193, 9.956985, 10.300814, 10.288845, 9.4423~
$ log_pri_length_25000        <dbl> 12.77378, 10.16440, 13.14450, 10.12663, 13.02123, 13.68658, 12.19710, 11.33494,~
$ log_prisec_length_500       <dbl> 6.214608, 6.214608, 6.214608, 6.214608, 7.141268, 6.214608, 6.214608, 6.214608,~
$ log_prisec_length_1000      <dbl> 9.240294, 7.600902, 7.600902, 8.793450, 8.407702, 9.451706, 7.600902, 8.094200,~
$ log_prisec_length_5000      <dbl> 11.485093, 9.425537, 10.155961, 10.562382, 11.698608, 12.086627, 10.288845, 10.~
$ log_prisec_length_10000     <dbl> 12.75582, 11.44833, 11.59563, 11.69093, 12.95940, 13.60081, 11.04273, 11.16916,~
$ log_prisec_length_25000     <dbl> 13.98749, 13.15082, 13.44293, 13.58697, 14.59757, 14.87934, 12.99864, 12.53690,~
$ log_nei_2008_pm10_sum_10000 <dbl> 4.91110140, 3.86982666, 4.03184660, 0.00000000, 5.89930186, 5.67208039, 4.20670~
$ log_nei_2008_pm10_sum_15000 <dbl> 5.399131255, 3.883688842, 5.459257126, 0.000000000, 6.294167687, 6.365952312, 4~
$ log_nei_2008_pm10_sum_25000 <dbl> 5.8160473, 3.8872637, 6.8845369, 3.7656347, 6.7347561, 8.6395685, 5.7761975, 3.~
$ popdens_county              <dbl> 208.719947, 3.542463, 117.314577, 49.648341, 763.218756, 2961.989125, 95.614433~
$ popdens_zcta                <dbl> 1276.059851, 11.183401, 1048.711994, 8.422494, 1225.314330, 2024.168660, 1464.2~
$ nohs                        <dbl> 4.3, 5.1, 3.7, 4.8, 2.1, 0.0, 2.5, 7.7, 0.8, 8.6, 1.4, 19.2, 8.9, 6.4, 8.3, 2.2~
$ somehs                      <dbl> 6.7, 10.4, 5.9, 11.5, 10.5, 0.0, 4.3, 7.5, 4.0, 13.9, 4.8, 25.4, 26.4, 9.5, 11.~
$ hs                          <dbl> 31.7, 40.3, 17.9, 47.3, 30.0, 0.0, 17.8, 21.7, 33.4, 33.4, 25.3, 26.5, 31.6, 26~
$ somecollege                 <dbl> 27.2, 24.1, 26.3, 20.0, 27.1, 0.0, 26.1, 26.0, 19.2, 18.1, 22.4, 20.0, 28.4, 26~
$ associate                   <dbl> 8.2, 7.4, 8.3, 3.1, 8.5, 71.4, 13.2, 7.6, 11.8, 11.0, 10.3, 5.4, 3.6, 4.3, 9.7,~
$ bachelor                    <dbl> 15.0, 8.6, 20.2, 9.8, 14.2, 0.0, 23.4, 17.2, 20.9, 10.2, 25.2, 2.3, 1.1, 19.1, ~
$ grad                        <dbl> 6.8, 4.2, 17.7, 3.5, 7.6, 28.6, 12.6, 12.3, 9.9, 4.8, 10.5, 1.1, 0.0, 8.4, 5.9,~
$ pov                         <dbl> 13.500, 18.900, 6.700, 14.400, 12.500, 3.525, 5.900, 10.300, 5.300, 14.000, 5.8~
$ hs_orless                   <dbl> 42.7, 55.8, 27.5, 63.6, 42.6, 0.0, 24.6, 36.9, 38.2, 55.9, 31.5, 71.1, 66.9, 42~
$ urc2006                     <dbl> 3, 6, 1, 5, 1, 1, 2, 1, 2, 6, 4, 4, 4, 4, 4, 2, 5, 3, 5, 4, 3, 5, 5, 1, 1, 1, 3~
$ aod                         <dbl> 54.11111, 31.16667, 83.12500, 33.36364, 50.80000, 53.50000, 65.09091, 18.14286,~
$ state_California            <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
$ city_Not.in.a.city          <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
baked_test_pm <- recipes::bake(prepped_rec, new_data = test_pm)
Warning: There are new levels in a factor: Colbert, Etowah, Russell, Walker, Cochise, Coconino, Mohave, Arkansas, Crittenden, Garland, Phillips, Pope, Calaveras, Humboldt, Inyo, Merced, Monterey, San Benito, Sonoma, Tulare, Arapahoe, Elbert, Larimer, Mesa, Brevard, Leon, Sarasota, Paulding, Ada, Benewah, Lemhi, Shoshone, DuPage, McHenry, Winnebago, Elkhart, St. Joseph, Tippecanoe, Vanderburgh, Black Hawk, Pottawattamie, Van Buren, Wright, Boyd, Christian, Daviess, Hardin, Kenton, Pike, Calcasieu, St. Bernard, Tangipahoa, Baltimore, Cecil, Bristol, Berrien, Missaukee, Muskegon, Oakland, Dakota, Olmsted, Grenada, Lauderdale, Cascade, Sanders, Silver Bow, Scotts Bluff, Merrimack, Rockingham, Atlantic, Gloucester, Lea, San Juan, Santa Fe, Onondaga, Steuben, Westchester, Martin, Pitt, Swain, Watauga, Billings, Lorain, Muskogee, Sequoyah, Deschutes, Multnomah, Umatilla, Berks, Bucks, Cambria, Centre, Lackawanna, Northampton, York, Edgefield, Florence, Pennington, Bowie, Ector, Cache, Frederick, Brist [... truncated]
glimpse(baked_test_pm)
Rows: 292
Columns: 38
$ id                          <fct> 1033.1002, 1055.001, 1069.0003, 1073.0023, 1073.1005, 1073.1009, 1073.5003, 109~
$ value                       <dbl> 11.212174, 12.375394, 10.508850, 15.591017, 12.399355, 11.103704, 11.775000, 10~
$ fips                        <fct> 1033, 1055, 1069, 1073, 1073, 1073, 1073, 1097, 1101, 1113, 1127, 4003, 4005, 4~
$ lat                         <dbl> 34.75878, 33.99375, 31.22636, 33.55306, 33.33111, 33.45972, 33.80167, 30.76994,~
$ lon                         <dbl> -87.65056, -85.99107, -85.39077, -86.81500, -87.00361, -87.30556, -86.94250, -8~
$ CMAQ                        <dbl> 9.402679, 9.241744, 9.121892, 10.235612, 10.235612, 8.161789, 8.816101, 8.65489~
$ zcta_area                   <dbl> 16716984, 154069359, 162685124, 26929603, 166239542, 385566685, 230081840, 1230~
$ zcta_pop                    <dbl> 9042, 20045, 30217, 9010, 16140, 3699, 13794, 6106, 12997, 8638, 10424, 1021, 6~
$ imp_a500                    <dbl> 19.17301038, 16.49307958, 19.13927336, 41.83304498, 1.69765343, 0.00000000, 0.4~
$ imp_a15000                  <dbl> 5.2472094, 5.1612102, 4.7401296, 17.4524844, 4.3006226, 0.1623359, 0.9481135, 7~
$ county_area                 <dbl> 1534877333, 1385618994, 1501737720, 2878192209, 2878192209, 3423328940, 2878192~
$ county_pop                  <dbl> 54428, 104430, 101547, 658466, 658466, 194656, 658466, 412992, 229363, 189885, ~
$ log_dist_to_prisec          <dbl> 5.760131, 5.261457, 7.112373, 6.600958, 6.526896, 9.789557, 8.734821, 5.779170,~
$ log_pri_length_5000         <dbl> 8.517193, 9.066563, 8.517193, 11.156977, 10.915066, 8.517193, 8.517193, 10.3642~
$ log_pri_length_25000        <dbl> 10.15769, 12.01356, 10.12663, 12.98762, 12.86472, 10.12663, 11.85793, 12.23758,~
$ log_prisec_length_500       <dbl> 8.611945, 8.740680, 6.214608, 6.214608, 6.214608, 6.214608, 6.214608, 7.595229,~
$ log_prisec_length_1000      <dbl> 9.735569, 9.627898, 7.600902, 9.075921, 8.949239, 7.600902, 7.600902, 8.658654,~
$ log_prisec_length_5000      <dbl> 11.770407, 11.728889, 12.298627, 12.281645, 11.039001, 8.517193, 8.517193, 11.7~
$ log_prisec_length_10000     <dbl> 12.840663, 12.768279, 12.994141, 13.278416, 12.128956, 9.210340, 10.420770, 12.~
$ log_prisec_length_25000     <dbl> 13.79973, 13.70026, 13.85550, 14.45221, 13.80017, 11.89377, 13.34258, 13.90072,~
$ log_nei_2008_pm10_sum_10000 <dbl> 6.69187313, 4.43719884, 0.92888890, 8.20975958, 5.77118113, 0.01654963, 0.00000~
$ log_nei_2008_pm10_sum_15000 <dbl> 6.70127741, 4.46267932, 3.67473904, 8.64883455, 6.86889676, 0.01654963, 4.01669~
$ log_nei_2008_pm10_sum_25000 <dbl> 7.148858, 4.678311, 3.744629, 8.858019, 8.021072, 6.891992, 6.961580, 6.640326,~
$ popdens_county              <dbl> 35.4608142, 75.3670384, 67.6196640, 228.7776327, 228.7776327, 56.8616114, 228.7~
$ popdens_zcta                <dbl> 540.8870404, 130.1037411, 185.7391706, 334.5760426, 97.0888142, 9.5936712, 59.9~
$ nohs                        <dbl> 7.3, 4.3, 5.8, 7.1, 2.7, 11.1, 9.7, 3.0, 8.8, 8.6, 5.7, 32.7, 1.7, 23.9, 8.6, 3~
$ somehs                      <dbl> 15.8, 13.3, 11.6, 17.1, 6.6, 11.6, 21.6, 17.1, 19.3, 24.6, 14.3, 11.8, 0.0, 17.~
$ hs                          <dbl> 30.6, 27.8, 29.8, 37.2, 30.7, 46.0, 39.3, 39.0, 36.0, 27.7, 37.6, 45.0, 22.3, 2~
$ somecollege                 <dbl> 20.9, 29.2, 21.4, 23.5, 25.7, 17.2, 21.6, 22.4, 20.2, 19.4, 21.3, 7.9, 36.4, 20~
$ associate                   <dbl> 7.6, 10.1, 7.9, 7.3, 8.0, 4.1, 5.2, 6.6, 4.6, 4.3, 7.3, 2.6, 14.7, 4.3, 4.9, 10~
$ bachelor                    <dbl> 12.7, 10.0, 13.7, 5.9, 17.6, 7.1, 2.2, 7.8, 7.6, 9.5, 10.3, 0.0, 13.6, 4.2, 2.7~
$ grad                        <dbl> 5.1, 5.4, 9.8, 2.0, 8.7, 2.9, 0.4, 4.2, 3.5, 5.9, 3.5, 0.0, 11.4, 2.0, 0.6, 5.4~
$ pov                         <dbl> 19.0, 8.8, 15.6, 25.5, 7.3, 8.1, 13.3, 23.2, 23.0, 37.7, 19.3, 15.6, 64.3, 29.5~
$ hs_orless                   <dbl> 53.7, 45.4, 47.2, 61.4, 40.0, 68.7, 70.6, 59.1, 64.1, 60.9, 57.6, 89.5, 24.0, 6~
$ urc2006                     <dbl> 4, 4, 4, 1, 1, 1, 2, 3, 3, 3, 2, 5, 4, 1, 5, 2, 6, 2, 4, 6, 5, 5, 3, 3, 6, 3, 5~
$ aod                         <dbl> 36.000000, 43.416667, 33.000000, 39.583333, 38.750000, 40.363636, 42.636364, 42~
$ state_California            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1~
$ city_Not.in.a.city          <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~

Great, now we no longer have NA values!

specify the model

For our case, we are going to start our analysis with a linear regression but we will demonstrate how we can try different models.

PM_model <- parsnip::linear_reg() # PM was used in the name for particulate matter
PM_model
Linear Regression Model Specification (regression)

OK. So far, all we have defined is that we want to use a linear regression… Let’s tell parsnip more about what we want.

lm_PM_model <- 
  PM_model  %>%
  parsnip::set_engine("lm")

lm_PM_model
Linear Regression Model Specification (regression)

Computational engine: lm 

Here, we aim to predict the air pollution. You can do this with the set_mode() function of the parsnip package, by using either set_mode(“classification”) or set_mode(“regression”).

lm_PM_model <- 
  PM_model  %>%
  parsnip::set_engine("lm") %>%
  set_mode("regression")

lm_PM_model
Linear Regression Model Specification (regression)

Computational engine: lm 
PM_wflow <-workflows::workflow() %>%
           workflows::add_recipe(novel_rec) %>%
           workflows::add_model(lm_PM_model)
PM_wflow
== Workflow =========================================================================================================
Preprocessor: Recipe
Model: linear_reg()

-- Preprocessor -----------------------------------------------------------------------------------------------------
3 Recipe Steps

* step_dummy()
* step_corr()
* step_nzv()

-- Model ------------------------------------------------------------------------------------------------------------
Linear Regression Model Specification (regression)

Computational engine: lm 
PM_wflow_fit <- parsnip::fit(PM_wflow, data = train_pm)
Warning in cor(x, use = use, method = method) :
  the standard deviation is zero
Warning: The correlation matrix has missing values. 273 columns were excluded from the filter.
PM_wflow_fit
== Workflow [trained] ===============================================================================================
Preprocessor: Recipe
Model: linear_reg()

-- Preprocessor -----------------------------------------------------------------------------------------------------
3 Recipe Steps

* step_dummy()
* step_corr()
* step_nzv()

-- Model ------------------------------------------------------------------------------------------------------------

Call:
stats::lm(formula = ..y ~ ., data = data)

Coefficients:
                (Intercept)                          lat                          lon                         CMAQ  
                  2.936e+02                    3.261e-02                    1.586e-02                    2.463e-01  
                  zcta_area                     zcta_pop                     imp_a500                   imp_a15000  
                 -3.433e-10                    1.013e-05                    5.064e-03                   -3.066e-03  
                county_area                   county_pop           log_dist_to_prisec          log_pri_length_5000  
                 -2.324e-11                   -7.576e-08                    6.214e-02                   -2.006e-01  
       log_pri_length_25000        log_prisec_length_500       log_prisec_length_1000       log_prisec_length_5000  
                 -5.411e-02                    2.204e-01                    1.154e-01                    2.374e-01  
    log_prisec_length_10000      log_prisec_length_25000  log_nei_2008_pm10_sum_10000  log_nei_2008_pm10_sum_15000  
                 -3.436e-02                    5.224e-01                    1.829e-01                   -2.355e-02  
log_nei_2008_pm10_sum_25000               popdens_county                 popdens_zcta                         nohs  
                  2.403e-02                    2.203e-05                   -2.132e-06                   -2.983e+00  
                     somehs                           hs                  somecollege                    associate  
                 -2.956e+00                   -2.962e+00                   -2.967e+00                   -2.999e+00  
                   bachelor                         grad                          pov                    hs_orless  
                 -2.979e+00                   -2.978e+00                    1.859e-03                           NA  
                    urc2006                          aod             state_California           city_Not.in.a.city  
                  2.577e-01                    1.535e-02                    3.114e+00                   -4.250e-02  
assessing the model fit
wflowoutput <- PM_wflow_fit %>% 
  pull_workflow_fit() %>% 
  broom::tidy() 

wflowoutput
important
PM_wflow_fit %>% 
  pull_workflow_fit() %>% 
  vip::vip(num_features = 10)

model performance

Machine learning (ML) is an optimization problem that tries to minimize the distance between our predicted outcome \(\hat Y = f(X)\) and actual outcome \(Y\) using our features (or predictor variables) \(X\) as input to a function \(f\) that we want to estimate.

As our goal in this section is to assess overall model performance,

wf_fit <- PM_wflow_fit %>% 
  pull_workflow_fit()

First, let’s pull out our predicted outcome values \(\hat Y = f(X)\)


wf_fitted_values <- wf_fit$fit$fitted.values
head(wf_fitted_values)
        1         2         3         4         5         6 
12.186782  9.139406 12.646119 10.377628 11.909934  9.520860 
# or

wf_fitted_values <- 
  broom::augment(wf_fit$fit, data = preproc_train) %>% 
  select(value, .fitted:.std.resid)

wf_fitted_values  #see .fitted variable

Or use the predict() function. Here we use the actual workflow, so we can (and actually need to) use the raw data instead of the preprocessed data.

Now, we can compare the predicted outcome values (or fitted values) \(\hat Y\) to the actual outcome values \(Y\) that we observed:

library(ggplot2)
wf_fitted_values %>% 
  ggplot(aes(x =  value, y = .fitted)) + 
  geom_point() + 
  xlab("actual outcome values") + 
  ylab("predicted outcome values")

Next, let’s use different distance functions \(d(\cdot)\) to assess how far off our predicted outcome \(\hat Y = f(X)\) and actual outcome \(Y\) values are from each other:

\[d(Y- \hat Y)\] There are entire scholarly fields of research dedicated to identifying different distance metrics \(d(\cdot)\) for machine learning applications. However, we will focus on root mean squared error ( rmse )

\[ RMSE = \sqrt{\frac{\sum_{i=1}^{n}(\hat y_i - y_i)^2}{n}} \] One way to calculate these metrics within the tidymodels framework is to use the yardstick package using the metrics() function.

We also intend to perform cross validation, so we will now split the training data further using the vfold_cv() function of the rsample package.

set.seed(1234)

vfold_pm <- rsample::vfold_cv(data = train_pm, v = 10)
vfold_pm
#  10-fold cross-validation 
pull(vfold_pm, splits)
[[1]]
<Analysis/Assess/Total>
<525/59/584>

[[2]]
<Analysis/Assess/Total>
<525/59/584>

[[3]]
<Analysis/Assess/Total>
<525/59/584>

[[4]]
<Analysis/Assess/Total>
<525/59/584>

[[5]]
<Analysis/Assess/Total>
<526/58/584>

[[6]]
<Analysis/Assess/Total>
<526/58/584>

[[7]]
<Analysis/Assess/Total>
<526/58/584>

[[8]]
<Analysis/Assess/Total>
<526/58/584>

[[9]]
<Analysis/Assess/Total>
<526/58/584>

[[10]]
<Analysis/Assess/Total>
<526/58/584>

We can fit the model to our cross validation folds using the fit_resamples() function of the tune package, by specifying our workflow object and the cross validation fold object we just created. See here for more information.

set.seed(122)
resample_fit <- tune::fit_resamples(PM_wflow, vfold_pm)
! Fold01: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 330...
! Fold01: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Forest, Niagara, Sp...
! Fold02: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 328...
! Fold02: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Macon, Arlington, C...
! Fold03: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 331...
! Fold03: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Oconee, Cobb, Vigo,...
! Fold04: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 324...
! Fold04: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Allen, McDowell, Ch...
! Fold05: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 328...
! Fold05: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Nevada, There are n...
! Fold06: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 326...
! Fold06: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Yuma, Flathead, Gra...
! Fold07: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 331...
! Fold07: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Hampton City, Rando...
! Fold08: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 328...
! Fold08: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Athens, Cabell, Dou...
! Fold09: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 331...
! Fold09: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Maine, North Dakota...
! Fold10: preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 329...
! Fold10: preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Stark, St. Lucie, A...

We can now take a look at various performance metrics based on the fit of our cross validation “resamples.”

To do this we will use the collect_metrics() function of the tune package. This will show us the mean of the accuracy estimate of the different cross validation folds.

resample_fit
Warning: This tuning result has notes. Example notes on model fitting include:
preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Macon, Arlington, Champaign, Citrus, Klamath, Cumberland, Ventura, Monongalia, Dona Ana, Ottawa, Tuscaloosa, Edgecombe, Peoria, Macomb, Teton, Tooele, St. Lawrence, Lincoln, Gibson, prediction from a rank-deficient fit may be misleading
preprocessor 1/1: the standard deviation is zero, The correlation matrix has missing values. 326 columns were excluded from the filter.
preprocessor 1/1, model 1/1 (predictions): There are new levels in a factor: Hampton City, Randolph, Hinds, Codington, Dauphin, Bennington, Brookings, Raleigh, Passaic, Duplin, Erie, Page, Bullitt, Stanislaus, Cameron, Lucas, Davis, Santa Clara, Mayes, White, DeSoto, Chautauqua, Terrebonne, Harford, Spokane, La Salle, prediction from a rank-deficient fit may be misleading
# Resampling results
# 10-fold cross-validation 

Random forest

RF_rec <- recipe(train_pm) %>%
    update_role(everything(), new_role = "predictor")%>%
    update_role(value, new_role = "outcome")%>%
    update_role(id, new_role = "id variable") %>%
    update_role("fips", new_role = "county id") %>%
    step_novel("state") %>%
    step_string2factor("state", "county", "city") %>%
    step_rm("county") %>%
    step_rm("zcta") %>%
    step_corr(all_numeric())%>%
    step_nzv(all_numeric())
PMtree_model <- 
  parsnip::rand_forest(mtry = 10, min_n = 4)
PMtree_model
Random Forest Model Specification (unknown)

Main Arguments:
  mtry = 10
  min_n = 4
library(randomForest)
randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.

Attaching package: ‘randomForest’

The following object is masked from ‘package:dplyr’:

    combine

The following object is masked from ‘package:ggplot2’:

    margin
RF_PM_model <- 
  PMtree_model %>%
  set_engine("randomForest") %>%
  set_mode("regression")

RF_PM_model
Random Forest Model Specification (regression)

Main Arguments:
  mtry = 10
  min_n = 4

Computational engine: randomForest 
RF_wflow <- workflows::workflow() %>%
            workflows::add_recipe(RF_rec) %>%
            workflows::add_model(RF_PM_model)
RF_wflow
== Workflow ============================================================================================
Preprocessor: Recipe
Model: rand_forest()

-- Preprocessor ----------------------------------------------------------------------------------------
6 Recipe Steps

* step_novel()
* step_string2factor()
* step_rm()
* step_rm()
* step_corr()
* step_nzv()

-- Model -----------------------------------------------------------------------------------------------
Random Forest Model Specification (regression)

Main Arguments:
  mtry = 10
  min_n = 4

Computational engine: randomForest 
RF_wflow_fit <- parsnip::fit(RF_wflow, data = train_pm)
RF_wflow_fit
== Workflow [trained] ==================================================================================
Preprocessor: Recipe
Model: rand_forest()

-- Preprocessor ----------------------------------------------------------------------------------------
6 Recipe Steps

* step_novel()
* step_string2factor()
* step_rm()
* step_rm()
* step_corr()
* step_nzv()

-- Model -----------------------------------------------------------------------------------------------

Call:
 randomForest(x = maybe_data_frame(x), y = y, mtry = min_cols(~10,      x), nodesize = min_rows(~4, x)) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 10

          Mean of squared residuals: 2.738
                    % Var explained: 57.67
RF_wflow_fit %>% 
  pull_workflow_fit() %>% 
  vip::vip(num_features = 10)

set.seed(456)
resample_RF_fit <- tune::fit_resamples(RF_wflow, vfold_pm)
collect_metrics(resample_RF_fit)
RF_PM_model <- 
  parsnip::rand_forest(mtry = 10, min_n = 4) %>%
  set_engine("randomForest") %>%
  set_mode("regression")

RF_PM_model
Random Forest Model Specification (regression)

Main Arguments:
  mtry = 10
  min_n = 4

Computational engine: randomForest 

hyperparameters

tune_RF_model <- rand_forest(mtry = tune(), min_n = tune()) %>%
  set_engine("randomForest") %>%
  set_mode("regression")
    

tune_RF_model
Random Forest Model Specification (regression)

Main Arguments:
  mtry = tune()
  min_n = tune()

Computational engine: randomForest 

add to workflow

RF_tune_wflow <- workflows::workflow() %>%
            workflows::add_recipe(RF_rec) %>%
            workflows::add_model(tune_RF_model)
RF_tune_wflow
== Workflow ============================================================================================
Preprocessor: Recipe
Model: rand_forest()

-- Preprocessor ----------------------------------------------------------------------------------------
6 Recipe Steps

* step_novel()
* step_string2factor()
* step_rm()
* step_rm()
* step_corr()
* step_nzv()

-- Model -----------------------------------------------------------------------------------------------
Random Forest Model Specification (regression)

Main Arguments:
  mtry = tune()
  min_n = tune()

Computational engine: randomForest 

Tune in parallel

detectCores()
[1] 12

twelve cores

tune_RF_results <- tune_grid(object = RF_tune_wflow, resamples = vfold_pm, grid = 20)
i Creating pre-processing data to finalize unknown parameter: mtry
tune_RF_results
# Tuning results
# 10-fold cross-validation 
tune_RF_results%>%
  collect_metrics() %>%
  head()
show_best(tune_RF_results, metric = "rmse", n =1)
tuned_RF_values<- select_best(tune_RF_results, "rmse")
tuned_RF_values
RF_tuned_wflow <-RF_tune_wflow %>%
  tune::finalize_workflow(tuned_RF_values)
RF_tuned_wflow
== Workflow ============================================================================================
Preprocessor: Recipe
Model: rand_forest()

-- Preprocessor ----------------------------------------------------------------------------------------
6 Recipe Steps

* step_novel()
* step_string2factor()
* step_rm()
* step_rm()
* step_corr()
* step_nzv()

-- Model -----------------------------------------------------------------------------------------------
Random Forest Model Specification (regression)

Main Arguments:
  mtry = 15
  min_n = 3

Computational engine: randomForest 
test_predictions <-collect_predictions(overallfit)
test_predictions
test_predictions %>% 
  ggplot(aes(x =  value, y = .pred)) + 
  geom_point() + 
  xlab("actual outcome values") + 
  ylab("predicted outcome values") +
  geom_smooth(method = "lm")
`geom_smooth()` using formula 'y ~ x'

---
title: "case studies"
output: html_notebook
---

### Load library packages



```{r}
library(tidyverse)
library(skimr)
library(tidymodels)
library(corrplot)
```



## 5.14 Case studies

https://jhudatascience.org/tidyversecourse/model.html#case-studies-4

### Predicting Annual Air Pollution (case study 1)


#### the data

There are 48 predictors with values for 876 monitors (observations).  The data comes from the US Environmental Protection Agency (EPA), the National Aeronautics and Space Administration (NASA), the US Census, and the National Center for Health Statistics (NCHS).

https://jhudatascience.org/tidyversecourse/model.html#data-import

```{r}
# Import the air pollution monitor data (876 monitors, 50 columns).
# NOTE(review): path is relative to the notebook's working directory.
pm <- read_csv("data/pm25_data.csv")
glimpse(pm)
```

```{r}
# Treat the identifier columns as categorical rather than numeric.
pm <- pm %>%
  mutate(across(all_of(c("id", "fips", "zcta")), as_factor))
```


```{r}
skim(pm)
```


```{r fig.height=12, fig.width=12}
# Pairwise correlations among the numeric predictors, visualized as a corrplot.
# select_if() is superseded in dplyr >= 1.0; select(where(...)) is the
# current tidy-select idiom and behaves identically here.
PM_cor <- cor(pm %>% dplyr::select(where(is.numeric)))
corrplot::corrplot(PM_cor, tl.cex = 0.5)
```


### split the data

tidymodels

```{r}
# Reproducible split: 2/3 of monitors for training, 1/3 held out for testing.
set.seed(1234)
pm_split <- rsample::initial_split(data = pm, prop = 2/3)
pm_split
```

Importantly the initial_split() function only determines what rows of our pm dataframe should be assigned for training or testing, it does not actually split the data.

To extract the testing and training data we can use the training() and testing() functions also of the rsample package.


```{r}
# Materialize the training and testing partitions defined by pm_split.
train_pm <- rsample::training(pm_split)
test_pm <- rsample::testing(pm_split)
```

### recipe

https://recipes.tidymodels.org/reference/index.html

```{r}
# library(tidymodels)
# Start a recipe: `value` is the outcome, all other columns are predictors.
simple_rec <- recipes::recipe(value ~ ., data = train_pm)

simple_rec
```

#### update role 

of `id` which is not a predictor

```{r}
# Same recipe, but `id` is demoted from predictor to a plain identifier role
# so it is carried through without being used for modeling.
simple_rec <- recipes::recipe(value ~ ., data = train_pm) %>%
  recipes::update_role(id, new_role = "id variable")

simple_rec
```

#### steps


https://recipes.tidymodels.org/reference/index.html

All of the step functions look like step_*() with the * replaced with a name, except for the check functions which look like check_*().

There are several ways to select what variables to apply steps to:

1. Using tidyselect methods: contains(), matches(), starts_with(), ends_with(), everything(), num_range()
1. Using the type: all_nominal(), all_numeric() , has_type()
1. Using the role: all_predictors(), all_outcomes(), has_role()
1. Using the name - use the actual name of the variable/variables of interest


> We want to dummy encode our categorical variables so that they are numeric as we plan to use a linear regression for our model.

We will use the one-hot encoding means that we do not simply encode our categorical variables numerically, as our numeric assignments can be interpreted by algorithms as having a particular rank or order. Instead, binary variables made of 1s and 0s are used to arbitrarily assign a numeric value that has no apparent order.


```{r}
# One-hot encode the categorical variables (demo only; result not saved).
simple_rec %>%
  step_dummy(state, county, city, zcta, one_hot = TRUE)
```


```{r}
# Mark `fips` as a county identifier so it is no longer treated as a
# predictor (demo only; result not saved).
simple_rec %>%
  update_role("fips", new_role = "county id")
```

##### step_corr - remove redundant correlations

We also want to remove variables that appear to be redundant and are highly correlated with others, as we know from our exploratory data analysis that many of our variables are correlated with one another. We can do this using the step_corr() function.


```{r}
# Drop highly correlated predictors, always keeping CMAQ and aod
# (demo only; result not saved).
simple_rec %>%
  step_corr(all_predictors(), - CMAQ, - aod)
```

##### step_nzv -- remove variables with near-zero variance


```{r}
# Drop near-zero-variance predictors, always keeping CMAQ and aod
# (demo only; result not saved).
simple_rec %>%
  step_nzv(all_predictors(), - CMAQ, - aod)
```

##### Putting it all together

```{r}
# Full preprocessing recipe:
#   * `id` and `fips` become identifier roles, not predictors
#   * categorical variables are one-hot encoded
#   * correlated and near-zero-variance predictors are dropped,
#     always keeping CMAQ and aod
simple_rec <- recipes::recipe(value ~ ., data = train_pm) %>%
  recipes::update_role(id, new_role = "id variable") %>%
  update_role("fips", new_role = "county id") %>%
  step_dummy(state, county, city, zcta, one_hot = TRUE) %>%
  step_corr(all_predictors(), -CMAQ, -aod) %>%
  step_nzv(all_predictors(), -CMAQ, -aod)

simple_rec
```
#### Preprocessing

```{r}
# Estimate (train) the recipe on the training data; retain = TRUE keeps the
# preprocessed training set so bake(new_data = NULL) can return it later.
prepped_rec <- prep(simple_rec, verbose = TRUE, retain = TRUE)
```


##### bake

we retained our preprocessed training data (i.e. prep(retain=TRUE)), we can take a look at it by using the bake() function of the recipes package

requires the new_data = NULL argument when we are using the training data.


```{r}
# Pull out the preprocessed training data retained by prep(retain = TRUE).
preproc_train <- bake(prepped_rec, new_data = NULL)
glimpse(preproc_train)
```


###### extracting

```{r}
# Apply the trained recipe to the held-out test set.
baked_test_pm <- recipes::bake(prepped_rec, new_data = test_pm)
glimpse(baked_test_pm)
```


###### examine

```{r}
# Unique city values in each partition.
traincities <- distinct(train_pm, city)
testcities <- distinct(test_pm, city)

# Number of cities that appear in training but not in testing.
dim(dplyr::setdiff(traincities, testcities))

# Number of cities present in both partitions.
dim(dplyr::intersect(traincities, testcities))
```



So, let’s go back to our pm dataset and modify the city variable to take just two values — in a city or not in a city — using the case_when() function of dplyr. This function allows you to vectorize multiple if_else() statements.

```{r}
# Collapse city to a binary label (demonstration only: the result is
# printed, not saved back into pm until the next chunk).
pm %>% #count(city, sort = TRUE)
  mutate(city = case_when(city == "Not in a city" ~ "Not in a city",
                          city != "Not in a city" ~ "In a city")) %>% 
  select(state, county, city)

##### redo

```{r}
# Overwrite pm with the binary city variable, then re-create the
# train/test split using the same seed and proportion as before so the
# partitioning is reproducible.
pm <- pm %>% 
  mutate(city = case_when(city == "Not in a city" ~ "Not in a city",
                          city != "Not in a city" ~ "In a city"))

set.seed(1234) # same seed as before
pm_split <-rsample::initial_split(data = pm, prop = 2/3)
pm_split

```{r}
 # Extract the training and test partitions from the new split object,
 # replacing the earlier train_pm / test_pm.
 train_pm <-rsample::training(pm_split)
 test_pm <-rsample::testing(pm_split)

```{r}
# Rebuild the recipe on the re-split training data. Roles are assigned
# explicitly (no formula): everything starts as a predictor, then value
# becomes the outcome and id/fips become identifiers. Filtering steps here
# use all_numeric() and no longer exempt CMAQ/aod.
novel_rec <-train_pm %>%
    recipe() %>%
    update_role(everything(), new_role = "predictor") %>%
    update_role(value, new_role = "outcome") %>%
    update_role(id, new_role = "id variable") %>%
    update_role("fips", new_role = "county id") %>%
    step_dummy(state, county, city, zcta, one_hot = TRUE) %>%
    step_corr(all_numeric()) %>%
    step_nzv(all_numeric()) 

Now we will check the preprocessed data again to see if we still have NA values

```{r}
# Train the new recipe; retain = TRUE keeps the preprocessed training set.
prepped_rec <- prep(novel_rec, verbose = TRUE, retain = TRUE)

```{r}
# Inspect the retained, preprocessed training data.
preproc_train <- bake(prepped_rec, new_data = NULL)
glimpse(preproc_train)

```{r}
# Apply the trained recipe to the test set.
baked_test_pm <- recipes::bake(prepped_rec, new_data = test_pm)
```

```{r}
# Check the preprocessed test data (expecting no NA values now).
glimpse(baked_test_pm)
```

Great, now we no longer have NA values!

##### specify the model

 For our case, we are going to start our analysis with a linear regression but we will demonstrate how we can try different models.

```{r}
# Start the model specification: a linear regression with no engine or
# mode set yet.
PM_model <- parsnip::linear_reg() # PM was used in the name for particulate matter
PM_model

OK. So far, all we have defined is that we want to use a linear regression…
Let’s tell parsnip more about what we want.

```{r}
# Attach the computational engine: ordinary least squares via stats::lm().
lm_PM_model <- 
  PM_model  %>%
  parsnip::set_engine("lm")

lm_PM_model

Here, we aim to predict the air pollution. You can do this with the set_mode() function of the parsnip package, by using either set_mode("classification") or set_mode("regression").


```{r}
# Complete the specification: linear regression, lm engine, regression mode.
lm_PM_model <- PM_model %>%
  parsnip::set_engine("lm") %>%
  parsnip::set_mode("regression")

lm_PM_model

```{r}
# Bundle the preprocessing recipe and the model spec into one workflow.
PM_wflow <- workflows::workflow() %>%
  workflows::add_recipe(novel_rec) %>%
  workflows::add_model(lm_PM_model)

PM_wflow


```{r}
# Fit the whole workflow (preprocessing + model) on the raw training data.
PM_wflow_fit <- parsnip::fit(PM_wflow, data = train_pm)
PM_wflow_fit

##### assessing the model fit

```{r}
# Extract the fitted parsnip model from the workflow and tidy the
# coefficient table. NOTE(review): pull_workflow_fit() is deprecated in
# later workflows releases in favor of extract_fit_parsnip() — fine for
# the workflows 0.2.2 loaded here.
wflowoutput <- PM_wflow_fit %>% 
  pull_workflow_fit() %>% 
  broom::tidy() 

wflowoutput

##### important

```{r}
# Plot the 10 most important predictors of the fitted linear model.
PM_wflow_fit %>% 
  pull_workflow_fit() %>% 
  vip::vip(num_features = 10)

#### model performance

Machine learning (ML) is an optimization problem that tries to minimize the distance between our predicted outcome  $\hat Y = f(X)$ and actual outcome  $Y$ using our features (or predictor variables)  $X$ as input to a function  $f$ that we want to estimate.

**As our goal in this section is to assess overall model performance, **



```{r}
# Keep the fitted parsnip model object for direct inspection below.
wf_fit <- PM_wflow_fit %>% 
  pull_workflow_fit()

First, let’s pull out our predicted outcome values $\hat Y = f(X)$


```{r}

# Two equivalent ways to get fitted values; the second (broom::augment)
# deliberately overwrites the first because it also returns residual
# diagnostics alongside the observed outcome.
wf_fitted_values <- wf_fit$fit$fitted.values
head(wf_fitted_values)

# or

wf_fitted_values <- 
  broom::augment(wf_fit$fit, data = preproc_train) %>% 
  select(value, .fitted:.std.resid)

wf_fitted_values  #see .fitted variable


Or use the `predict()` function. Because we use the actual workflow here, we can (and actually need to) use the raw data instead of the preprocessed data.

```{r}
# Predict with the full workflow on the RAW training data (the workflow
# re-applies the recipe internally), then attach the observed outcome and
# identifiers for comparison.
values_pred_train <- 
  predict(PM_wflow_fit, train_pm) %>% 
  bind_cols(train_pm %>% select(value, fips, county, id)) 

values_pred_train  #see .pred column

Now, we can compare the predicted outcome values (or fitted values)  $\hat Y$ to the actual outcome values  $Y$ that we observed:

```{r}
# Scatter plot of predicted (.fitted) against observed outcome values.
library(ggplot2)
ggplot(wf_fitted_values, aes(x = value, y = .fitted)) +
  geom_point() +
  xlab("actual outcome values") +
  ylab("predicted outcome values")

Next, let’s use different distance functions  $d(\cdot)$ to assess how far off our predicted outcome  $\hat Y = f(X)$ and actual outcome  $Y$ values are from each other: 

$$d(Y- \hat Y)$$
There are entire scholarly fields of research dedicated to identifying different distance metrics  $d(\cdot)$ for machine learning applications. However, we will focus on root mean squared error `( rmse )`

$$RMSE = \sqrt{\frac{\sum_{i=1}^{n}(\hat{y}_i - y_i)^2}{n}}$$
One way to calculate these metrics within the `tidymodels` framework is to use the `yardstick` package using the `metrics()` function.


```{r}
# Regression metrics (rmse, rsq, mae) comparing observed vs. fitted values.
yardstick::metrics(wf_fitted_values, 
                   truth = value, estimate = .fitted)


We also intend to perform cross validation, so we will now split the training data further using the vfold_cv() function of the rsample package.


```{r}
# Create 10 cross-validation folds from the training data; the seed makes
# the fold assignment reproducible.
set.seed(1234)

vfold_pm <- rsample::vfold_cv(data = train_pm, v = 10)
vfold_pm

```{r}
# Inspect the individual fold split objects.
pull(vfold_pm, splits)
```

We can fit the model to our cross validation folds using the fit_resamples() function of the tune package, by specifying our workflow object and the cross validation fold object we just created. See [here](https://tidymodels.github.io/tune/reference/fit_resamples.html) for more information.

```{r}
# Fit the linear-regression workflow on each cross-validation fold.
set.seed(122)
resample_fit <- tune::fit_resamples(PM_wflow, vfold_pm)


We can now take a look at various performance metrics based on the fit of our cross validation “resamples.”

To do this we will use the collect_metrics() function of the tune package. This will show us the mean of the accuracy estimate of the different cross validation folds.


```{r}
# Print the resampling results object.
resample_fit
```

```{r}
# Average performance metrics across the 10 folds.
collect_metrics(resample_fit)
```


### Random forest

```{r}
# Recipe for the random forest: no dummy encoding (randomForest handles
# factors directly). step_novel guards against state levels unseen in
# training; county and zcta are dropped; correlated and near-zero-variance
# numeric predictors are removed.
RF_rec <- recipe(train_pm) %>%
    update_role(everything(), new_role = "predictor")%>%
    update_role(value, new_role = "outcome")%>%
    update_role(id, new_role = "id variable") %>%
    update_role("fips", new_role = "county id") %>%
    step_novel("state") %>%
    step_string2factor("state", "county", "city") %>%
    step_rm("county") %>%
    step_rm("zcta") %>%
    step_corr(all_numeric())%>%
    step_nzv(all_numeric())


```{r}
# Random forest spec: 10 candidate variables per split (mtry), minimum
# node size of 4 (min_n). Engine and mode are added in a later chunk.
PMtree_model <- 
  parsnip::rand_forest(mtry = 10, min_n = 4)
PMtree_model


```{r}
# Load the engine package used by set_engine("randomForest") below.
library(randomForest)
```

```{r}
# Attach the randomForest engine and regression mode to the spec.
RF_PM_model <- 
  PMtree_model %>%
  set_engine("randomForest") %>%
  set_mode("regression")

RF_PM_model

```{r}
# Combine the random-forest recipe and model spec into a workflow.
RF_wflow <- workflows::workflow() %>%
  workflows::add_recipe(RF_rec) %>%
  workflows::add_model(RF_PM_model)

RF_wflow


```{r}
# Fit the random-forest workflow on the raw training data.
RF_wflow_fit <- parsnip::fit(RF_wflow, data = train_pm)
RF_wflow_fit

```{r}
# Top 10 most important variables for the fitted random forest.
RF_wflow_fit %>% 
  pull_workflow_fit() %>% 
  vip::vip(num_features = 10)

```{r}
# Cross-validate the random-forest workflow and summarise fold metrics.
set.seed(456)
resample_RF_fit <- tune::fit_resamples(RF_wflow, vfold_pm)
collect_metrics(resample_RF_fit)

```{r}
# Same RF specification as before, rebuilt in a single pipeline
# (spec + engine + mode) — a more compact equivalent of the earlier chunks.
RF_PM_model <- 
  parsnip::rand_forest(mtry = 10, min_n = 4) %>%
  set_engine("randomForest") %>%
  set_mode("regression")

RF_PM_model

#### hyperparameters

```{r}
# Mark mtry and min_n as tunable placeholders instead of fixed values;
# tune_grid() will search over candidate values for both.
tune_RF_model <- rand_forest(mtry = tune(), min_n = tune()) %>%
  set_engine("randomForest") %>%
  set_mode("regression")
    

tune_RF_model

add to workflow

```{r}
# Workflow pairing the RF recipe with the tunable model spec.
RF_tune_wflow <- workflows::workflow() %>%
            workflows::add_recipe(RF_rec) %>%
            workflows::add_model(tune_RF_model)
RF_tune_wflow

Tune in parallel

```{r}
# Check how many CPU cores are available for parallel tuning.
library(parallel)
detectCores()
```

detectCores() reports twelve cores on this machine; we register six of them for parallel processing.

```{r}
# Register 6 parallel workers, then tune mtry/min_n over 20 grid candidates
# using the cross-validation folds. The seed makes the grid reproducible.
doParallel::registerDoParallel(cores=6)
set.seed(123)
tune_RF_results <- tune_grid(object = RF_tune_wflow, resamples = vfold_pm, grid = 20)

```{r}
# Print the tuning results object.
tune_RF_results
```

```{r}
# First few rows of the per-candidate performance metrics.
head(collect_metrics(tune_RF_results))

```{r}
# Show the single best hyperparameter combination by RMSE.
show_best(tune_RF_results, metric = "rmse", n =1)
```

```{r}
# Capture the winning mtry/min_n values to finalize the workflow with.
tuned_RF_values<- select_best(tune_RF_results, "rmse")
tuned_RF_values
```

```{r}
# Replace the tune() placeholders with the selected hyperparameter values.
RF_tuned_wflow <-RF_tune_wflow %>%
  tune::finalize_workflow(tuned_RF_values)
```

```{r}
# Print the finalized (tuned) workflow.
RF_tuned_wflow
```


```{r}
# Final evaluation: fit the TUNED workflow on the full training set and
# assess it once on the held-out test set via last_fit().
# BUG FIX: the original code immediately re-assigned `overallfit` with
# last_fit() on the untuned RF_wflow ("# or" alternative), silently
# discarding the tuned hyperparameters, so the reported metrics did not
# reflect the tuning. The alternative is kept below as a comment.
overallfit <- tune::last_fit(RF_tuned_wflow, pm_split)
# or, with the untuned workflow (mtry = 10, min_n = 4):
# overallfit <- RF_wflow %>%
#   tune::last_fit(pm_split)

collect_metrics(overallfit)
```


```{r}
# Pull the test-set predictions out of the final fit.
test_predictions <-collect_predictions(overallfit)
test_predictions
```


```{r}
# Predicted vs. observed outcome values on the test set, with a fitted
# linear trend line.
ggplot(test_predictions, aes(x = value, y = .pred)) +
  geom_point() +
  xlab("actual outcome values") +
  ylab("predicted outcome values") +
  geom_smooth(method = "lm")

